Load libraries
This project uses renv to keep track of installed packages. Install renv if not installed and load dependencies with renv::restore().
install.packages ("renv" )
renv:: restore ()
Read sample data
Get list of samples
Code
samples <- read_tsv ("config/samples.tsv" ,
col_types = list (
sample_name = col_character (),
cell_line = col_factor (),
exogenous_rna = col_factor (),
day = col_factor ()
)
)
units <- read_tsv ("config/units.tsv" ,
col_types = list (
sample_name = col_character (),
unit_name = col_character (),
fq1 = col_character (),
fq2 = col_character ()
)
)
sample_units <- dplyr:: left_join (samples, units, by = "sample_name" ) %>%
unite (sample_unit, sample_name, unit_name, remove = FALSE ) %>%
mutate (
day = as.factor (paste0 ("day" , day)),
batch = as.factor (case_when (
exogenous_rna == "mastermix1" ~ "batch3" ,
exogenous_rna == "mastermix2" ~ "batch3" ,
TRUE ~ "batch2"
)),
.before = cell_line,
)
sample_units
Table of exogenous RNA mixtures
Code
# Exogenous RNA mixtures
rna_mixes <- tibble ()
for (mix in c ("mastermix1" , "mastermix2" , "PJY103_mDNMT1" , "PJY300_mDNMT1" )) {
t <- Biostrings:: readDNAStringSet (sprintf ("data/references/%s.fa" , mix))
rna_mixes <- rbind (rna_mixes, tibble (
exogenous_rna = mix,
rna_species = word (t@ ranges@ NAMES, 1 ),
start = 1 ,
end = t@ ranges@ width
))
}
rna_mixes <- rna_mixes %>%
mutate (
active_start_min = case_when (
exogenous_rna == "mastermix1" ~ 11 ,
exogenous_rna == "mastermix2" ~ 11 ,
exogenous_rna == "PJY103_mDNMT1" ~ 11 ,
exogenous_rna == "PJY300_mDNMT1" ~ 11
),
active_start_max = case_when (
exogenous_rna == "mastermix1" ~ 14 ,
exogenous_rna == "mastermix2" ~ 14 ,
exogenous_rna == "PJY103_mDNMT1" ~ 15 ,
exogenous_rna == "PJY300_mDNMT1" ~ 15
),
cyptic_terminator_end = case_when (
exogenous_rna == "mastermix1" ~ 37 ,
exogenous_rna == "mastermix2" ~ 37 ,
exogenous_rna == "PJY103_mDNMT1" ~ 38 ,
exogenous_rna == "PJY300_mDNMT1" ~ 38
),
)
rna_mixes
Exogenous RNA counts
BAM reading functions
Code
rna_species_plot_range <- function (sequence_name) {
plot_range <- rna_mixes %>%
dplyr:: filter (.data$ rna_species == sequence_name) %>%
dplyr:: select (start, end) %>%
unique ()
return (plot_range)
}
# GRanges from BAM
granges_from_bam <- function (sample_unit,
sequence_name,
is_proper_pair = TRUE ,
bam_dir =
"results/alignments/exogenous_rna/sorted" ) {
plot_range <- rna_species_plot_range (sequence_name)
which <- GRanges (
sprintf ("%s:%i-%i" , sequence_name, plot_range$ start, plot_range$ end)
)
param <- ScanBamParam (
flag = scanBamFlag (isProperPair = is_proper_pair),
mapqFilter = 1 ,
which = which
)
aligned_fragments_list <- list ()
if (is_proper_pair) {
aligned_fragments <- granges (
readGAlignmentPairs (
sprintf ("%s/%s.bam" , bam_dir, sample_unit),
param = param
),
on.discordant.seqnames = "drop"
)
rna_info <- rna_mixes %>%
dplyr:: filter (rna_species == {{ sequence_name }}) %>%
dplyr:: select (- "exogenous_rna" ) %>%
unique ()
aligned_fragments_list$ active <- aligned_fragments %>%
plyranges:: filter (
start <= rna_info$ active_start_max,
end >= rna_info$ cyptic_terminator_end
)
aligned_fragments_list$ inactive <- aligned_fragments %>%
plyranges:: filter (
start > rna_info$ active_start_max,
end >= rna_info$ cyptic_terminator_end
)
aligned_fragments_list$ premature_termination <- aligned_fragments %>%
plyranges:: filter (end < rna_info$ cyptic_terminator_end)
} else {
aligned_fragments <- granges (
readGAlignments (
sprintf ("%s/%s.bam" , bam_dir, sample_unit),
param = param
)
)
aligned_fragments_list$ discordant <- aligned_fragments
}
return (aligned_fragments_list)
}
Read BAM coverage
Code
rna_species_plot_data <- list ()
for (rna_species_to_plot in rna_mixes$ rna_species) {
rna_info <- rna_mixes %>%
dplyr:: filter (rna_species == rna_species_to_plot)
sample_units_to_plot <- sample_units %>%
dplyr:: filter (exogenous_rna %in% rna_info$ exogenous_rna)
granges_to_plot <- list ()
for (sample_unit in sample_units_to_plot$ sample_unit) {
granges_to_plot[[sample_unit]] <- granges_from_bam (
sample_unit,
rna_species_to_plot,
TRUE
)
}
rna_species_plot_data[[rna_species_to_plot]] <- granges_to_plot
}
Organize coverage data
Code
exogenous_rna_count_data <- tibble (
sample_unit = character (),
sequence_name = character (),
category = character (),
count = numeric ()
)
for (rna_species in names (rna_species_plot_data)) {
for (sample_unit in names (rna_species_plot_data[[rna_species]])) {
for (category in
names (rna_species_plot_data[[rna_species]][[sample_unit]])) {
exogenous_rna_count_data <- exogenous_rna_count_data %>%
add_row (
sample_unit = sample_unit,
sequence_name = rna_species,
category = category,
count = length (
rna_species_plot_data[[rna_species]][[sample_unit]][[category]]
)
)
}
}
}
exogenous_rna_count_data
Summarize coverage data
Code
exogenous_rna_mapped_totals <- exogenous_rna_count_data %>%
group_by (sample_unit, sequence_name) %>%
summarize (mapped_fragments = sum (count))
`summarise()` has grouped output by 'sample_unit'. You can override using the
`.groups` argument.
Code
exogenous_rna_mapped_totals
Human small RNA gene counts
count only fragments that were properly aligned
annotate with GENCODE gene model
primary alignments were counted, even if the fragments aligned multiple times
fragments aligning to multiple features were assigned to the feature that mostly closely overlapped with the fragment
exogenous RNA counts are total fragments that aligned
Code
human_counts_dir <- "results/smrna_count/"
human_gene_counts_files <- paste0 (
human_counts_dir,
sample_units$ sample_unit,
"_first_proper_pair_gene_count.txt"
)
human_gene_counts <- readr:: read_tsv (
human_gene_counts_files[1 ],
comment = "#" ,
col_names = c ("gene" , human_gene_counts_files[1 ]),
col_types = "ci"
)
for (i in 2 : length (human_gene_counts_files)) {
gene_sample <-
readr:: read_tsv (
human_gene_counts_files[i],
comment = "#" ,
col_names = c ("gene" , human_gene_counts_files[i]),
col_types = "ci"
)
human_gene_counts <- human_gene_counts %>%
dplyr:: full_join (gene_sample, by = "gene" )
}
human_gene_counts <- human_gene_counts %>%
rename_all (~ stringr:: str_replace_all (
., human_counts_dir, ""
)) %>%
rename_all (~ str_replace_all (., "_first_proper_pair_gene_count.txt" , "" ))
exogenous_gene_counts <- exogenous_rna_count_data %>%
tidyr:: unite (gene, c (sequence_name, category), sep = ":" ) %>%
tidyr:: spread (sample_unit, count)
gene_counts <- rbind (human_gene_counts, exogenous_gene_counts)
gene_counts
Exogenous RNA “gene” names
Code
# List of exogenous genes to highlight
exogenous_rna_names <- gene_counts %>%
dplyr:: filter (str_detect (gene, "^PJY" )) %>%
pull (gene)
exogenous_rna_names
[1] "PJY103_mDNMT1:active" "PJY103_mDNMT1:inactive"
[3] "PJY103_mDNMT1:premature_termination" "PJY177_RNF2:active"
[5] "PJY177_RNF2:inactive" "PJY177_RNF2:premature_termination"
[7] "PJY179_FANCF:active" "PJY179_FANCF:inactive"
[9] "PJY179_FANCF:premature_termination" "PJY181_HEK3:active"
[11] "PJY181_HEK3:inactive" "PJY181_HEK3:premature_termination"
[13] "PJY182_HEK3:active" "PJY182_HEK3:inactive"
[15] "PJY182_HEK3:premature_termination" "PJY183_DNMT1:active"
[17] "PJY183_DNMT1:inactive" "PJY183_DNMT1:premature_termination"
[19] "PJY184_DNMT1:active" "PJY184_DNMT1:inactive"
[21] "PJY184_DNMT1:premature_termination" "PJY185_RUNX1:active"
[23] "PJY185_RUNX1:inactive" "PJY185_RUNX1:premature_termination"
[25] "PJY186_RUNX1:active" "PJY186_RUNX1:inactive"
[27] "PJY186_RUNX1:premature_termination" "PJY187_VEGFA:active"
[29] "PJY187_VEGFA:inactive" "PJY187_VEGFA:premature_termination"
[31] "PJY300_mDNMT1:active" "PJY300_mDNMT1:inactive"
[33] "PJY300_mDNMT1:premature_termination" "PJY302_EMX1:active"
[35] "PJY302_EMX1:inactive" "PJY302_EMX1:premature_termination"
[37] "PJY306_EMX1:active" "PJY306_EMX1:inactive"
[39] "PJY306_EMX1:premature_termination"
Sample comparisons
Heatmap of sample-to-sample distances
Code
sample_dists <- dist (t (assay (vsd)))
sample_dist_matrix <- as.matrix (sample_dists)
rownames (sample_dist_matrix) <- paste (vsd$ cell_line,
vsd$ exogenous_rna,
vsd$ day,
sep = "-"
)
colnames (sample_dist_matrix) <- NULL
Code
colors <- colorRampPalette (rev (brewer.pal (9 , "Blues" )))(255 )
pheatmap (sample_dist_matrix,
clustering_distance_rows = sample_dists,
clustering_distance_cols = sample_dists,
col = colors
)
PCA plot of samples
Code
plotPCA (vsd, intgroup = c ("cell_line" , "exogenous_rna" , "day" ))
Differential Expression
Batch 2 - Day 1
Select subset of samples then calculate the log2 fold change: cell line P1E10 vs Parental
Code
dds_batch2_day1 <- subset (dds, select = colData (dds)$ batch == "batch2" &
colData (dds)$ day == "day1" )
dds_batch2_day1$ exogenous_rna <- droplevels (dds_batch2_day1$ exogenous_rna)
dds_batch2_day1$ day <- droplevels (dds_batch2_day1$ day)
design (dds_batch2_day1) <- ~ exogenous_rna + cell_line
dds_batch2_day1 <- DESeq (dds_batch2_day1, parallel = TRUE )
gene-wise dispersion estimates: 4 workers
mean-dispersion relationship
final dispersion estimates, fitting model and testing: 4 workers
Code
class: DESeqDataSet
dim: 56726 16
metadata(1): version
assays(4): counts mu H cooks
rownames(56726): GAS5 SNORD30 ... PJY306_EMX1:inactive
PJY306_EMX1:premature_termination
rowData names(26): baseMean baseVar ... deviance maxCooks
colnames(16): P1E10_sorted_PJY103_pegRNA_day1_rep1
P1E10_sorted_PJY103_pegRNA_day1_rep2 ...
Parental_PJY300_epegRNA_day1_rep3 Parental_PJY300_epegRNA_day1_rep4
colData names(10): sample_unit sample_name ... fq2 sizeFactor
Code
res_batch2_day1 <- results (dds_batch2_day1, alpha = 0.05 )
res_batch2_day1
log2 fold change (MLE): cell line P1E10 vs Parental
Wald test p-value: cell line P1E10 vs Parental
DataFrame with 56726 rows and 6 columns
baseMean log2FoldChange lfcSE stat
<numeric> <numeric> <numeric> <numeric>
GAS5 3358381 0.1558718 0.0335418 4.647095
SNORD30 1605167 0.0910242 0.0576592 1.578660
SNORD49A 1156164 0.1842127 0.0436446 4.220748
SNORD26 857547 0.0954226 0.0539986 1.767132
SNHG1 780897 -0.0217632 0.0400343 -0.543614
... ... ... ... ...
PJY302_EMX1:inactive 0 NA NA NA
PJY302_EMX1:premature_termination 0 NA NA NA
PJY306_EMX1:active 0 NA NA NA
PJY306_EMX1:inactive 0 NA NA NA
PJY306_EMX1:premature_termination 0 NA NA NA
pvalue padj
<numeric> <numeric>
GAS5 3.36642e-06 7.69448e-05
SNORD30 1.14414e-01 4.51749e-01
SNORD49A 2.43493e-05 4.79464e-04
SNORD26 7.72060e-02 3.61519e-01
SNHG1 5.86707e-01 8.79696e-01
... ... ...
PJY302_EMX1:inactive NA NA
PJY302_EMX1:premature_termination NA NA
PJY306_EMX1:active NA NA
PJY306_EMX1:inactive NA NA
PJY306_EMX1:premature_termination NA NA
Code
out of 51621 with nonzero total read count
adjusted p-value < 0.05
LFC > 0 (up) : 937, 1.8%
LFC < 0 (down) : 1749, 3.4%
outliers [1] : 1, 0.0019%
low counts [2] : 23895, 46%
(mean count < 1)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
Log fold change shrinkage
Run log fold change shrinkage to account for low counts.
Code
res_batch2_day1_lfc <- lfcShrink (dds_batch2_day1,
coef = "cell_line_P1E10_vs_Parental" ,
type = "apeglm" , parallel = TRUE
)
using 'apeglm' for LFC shrinkage. If used in published research, please cite:
Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
sequence count data: removing the noise and preserving large differences.
Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
MA Plot (lfc shrunk)
MA Plot of shrunken log2 fold changes
Code
res_batch2_day1_lfc_df <- res_batch2_day1_lfc %>%
as.data.frame () %>%
rownames_to_column ("gene" ) %>%
dplyr:: mutate (significant = padj < 0.05 )
write_tsv (res_batch2_day1_lfc_df, file = "diffexp_results_batch2_day1.tsv" )
res_batch2_day1_lfc_labelled <- res_batch2_day1_lfc_df %>%
dplyr:: filter (gene %in% exogenous_rna_names) %>%
na.omit ()
ggplot (
res_batch2_day1_lfc_df,
aes (x = log2 (baseMean), y = log2FoldChange, colour = significant)
) +
geom_point () +
ggrepel:: geom_label_repel (
data = res_batch2_day1_lfc_labelled,
aes (label = gene, segment.colour = "black" ),
min.segment.length = 0 ,
show.legend = FALSE
) +
theme_bw ()
Warning: Removed 5105 rows containing missing values (geom_point).
Counts of exogenous rna
Plot of counts for a single gene (with lowest adjusted p-value)
Code
plot_list <- list ()
for (gene in res_batch2_day1_lfc_labelled$ gene) {
gene_prefix <- str_split (gene, "_" )[[1 ]][1 ]
d <- plotCounts (dds_batch2_day1,
gene = gene,
intgroup = "cell_line" ,
returnData = TRUE
)
p <- ggplot (
d %>%
rownames_to_column ("sample_unit" ) %>%
dplyr:: filter (grepl (gene_prefix, sample_unit)),
aes (x = cell_line, y = count)
) +
geom_point (position = position_jitter (width = 0.1 , height = 0 )) +
ggtitle (gene)
plot_list[[gene]] <- p
}
cowplot:: plot_grid (plotlist = plot_list)
Batch 2 - Day 2
Select subset of samples then calculate the log2 fold change: cell line P1E10 vs Parental
Code
dds_batch2_day2 <- subset (dds, select = colData (dds)$ batch == "batch2" &
colData (dds)$ day == "day2" )
dds_batch2_day2$ exogenous_rna <- droplevels (dds_batch2_day2$ exogenous_rna)
dds_batch2_day2$ day <- droplevels (dds_batch2_day2$ day)
design (dds_batch2_day2) <- ~ exogenous_rna + cell_line
dds_batch2_day2 <- DESeq (dds_batch2_day2, parallel = TRUE )
gene-wise dispersion estimates: 4 workers
mean-dispersion relationship
final dispersion estimates, fitting model and testing: 4 workers
Code
class: DESeqDataSet
dim: 56726 16
metadata(1): version
assays(4): counts mu H cooks
rownames(56726): GAS5 SNORD30 ... PJY306_EMX1:inactive
PJY306_EMX1:premature_termination
rowData names(26): baseMean baseVar ... deviance maxCooks
colnames(16): P1E10_sorted_PJY103_pegRNA_day2_rep1
P1E10_sorted_PJY103_pegRNA_day2_rep2 ...
Parental_PJY300_epegRNA_day2_rep3 Parental_PJY300_epegRNA_day2_rep4
colData names(10): sample_unit sample_name ... fq2 sizeFactor
Code
res_batch2_day2 <- results (dds_batch2_day2, alpha = 0.05 )
res_batch2_day2
log2 fold change (MLE): cell line P1E10 vs Parental
Wald test p-value: cell line P1E10 vs Parental
DataFrame with 56726 rows and 6 columns
baseMean log2FoldChange lfcSE stat
<numeric> <numeric> <numeric> <numeric>
GAS5 3192415 0.18567762 0.0302890 6.130190
SNORD30 1514330 0.15582664 0.0511819 3.044562
SNORD49A 1170290 0.19302856 0.0427795 4.512171
SNORD26 825436 0.09512922 0.0397968 2.390377
SNHG1 678111 0.00437697 0.0311162 0.140665
... ... ... ... ...
PJY302_EMX1:inactive 0 NA NA NA
PJY302_EMX1:premature_termination 0 NA NA NA
PJY306_EMX1:active 0 NA NA NA
PJY306_EMX1:inactive 0 NA NA NA
PJY306_EMX1:premature_termination 0 NA NA NA
pvalue padj
<numeric> <numeric>
GAS5 8.77739e-10 3.21830e-08
SNORD30 2.33019e-03 2.78025e-02
SNORD49A 6.41674e-06 1.40180e-04
SNORD26 1.68311e-02 1.34207e-01
SNHG1 8.88134e-01 9.77382e-01
... ... ...
PJY302_EMX1:inactive NA NA
PJY302_EMX1:premature_termination NA NA
PJY306_EMX1:active NA NA
PJY306_EMX1:inactive NA NA
PJY306_EMX1:premature_termination NA NA
Code
out of 50454 with nonzero total read count
adjusted p-value < 0.05
LFC > 0 (up) : 875, 1.7%
LFC < 0 (down) : 1593, 3.2%
outliers [1] : 53, 0.11%
low counts [2] : 24295, 48%
(mean count < 1)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
Log fold change shrinkage
Run log fold change shrinkage to account for low counts.
Code
res_batch2_day2_lfc <- lfcShrink (dds_batch2_day2,
coef = "cell_line_P1E10_vs_Parental" ,
type = "apeglm" , parallel = TRUE
)
using 'apeglm' for LFC shrinkage. If used in published research, please cite:
Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
sequence count data: removing the noise and preserving large differences.
Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
MA Plot (lfc shrunk)
MA Plot of shrunken log2 fold changes
Code
res_batch2_day2_lfc_df <- res_batch2_day2_lfc %>%
as.data.frame () %>%
rownames_to_column ("gene" ) %>%
dplyr:: mutate (significant = padj < 0.05 )
write_tsv (res_batch2_day2_lfc_df, file = "diffexp_results_batch2_day2.tsv" )
res_batch2_day2_lfc_labelled <- res_batch2_day2_lfc_df %>%
dplyr:: filter (gene %in% exogenous_rna_names) %>%
na.omit ()
ggplot (
res_batch2_day2_lfc_df,
aes (x = log2 (baseMean), y = log2FoldChange, color = significant)
) +
geom_point () +
ggrepel:: geom_label_repel (
data = res_batch2_day2_lfc_labelled,
aes (label = gene, segment.colour = "black" ),
min.segment.length = 0 ,
show.legend = FALSE
) +
theme_bw ()
Warning: Removed 6272 rows containing missing values (geom_point).
Counts of exogenous rna
Plot of counts for a single gene (with lowest adjusted p-value)
Code
plot_list <- list ()
for (gene in res_batch2_day2_lfc_labelled$ gene) {
gene_prefix <- str_split (gene, "_" )[[1 ]][1 ]
d <- plotCounts (dds_batch2_day2,
gene = gene,
intgroup = "cell_line" ,
returnData = TRUE
)
p <- ggplot (
d %>%
rownames_to_column ("sample_unit" ) %>%
dplyr:: filter (grepl (gene_prefix, sample_unit)),
aes (x = cell_line, y = count)
) +
geom_point (position = position_jitter (width = 0.1 , height = 0 )) +
ggtitle (gene)
plot_list[[gene]] <- p
}
cowplot:: plot_grid (plotlist = plot_list)
Batch 3 - Day 1
Select subset of samples then calculate the log2 fold change: cell line P1E10 vs Parental
Code
dds_batch3_day1 <- subset (dds, select = colData (dds)$ batch == "batch3" &
colData (dds)$ day == "day1" )
dds_batch3_day1$ exogenous_rna <- droplevels (dds_batch3_day1$ exogenous_rna)
dds_batch3_day1$ day <- droplevels (dds_batch3_day1$ day)
design (dds_batch3_day1) <- ~ exogenous_rna + cell_line
dds_batch3_day1 <- DESeq (dds_batch3_day1, parallel = TRUE )
gene-wise dispersion estimates: 4 workers
mean-dispersion relationship
final dispersion estimates, fitting model and testing: 4 workers
Code
class: DESeqDataSet
dim: 56726 12
metadata(1): version
assays(4): counts mu H cooks
rownames(56726): GAS5 SNORD30 ... PJY306_EMX1:inactive
PJY306_EMX1:premature_termination
rowData names(26): baseMean baseVar ... deviance maxCooks
colnames(12): Parental_mastermix1_day1_rep1
Parental_mastermix1_day1_rep2 ... P1E10_sorted_mastermix2_day1_rep2
P1E10_sorted_mastermix2_day1_rep3
colData names(10): sample_unit sample_name ... fq2 sizeFactor
Code
res_batch3_day1 <- results (dds_batch3_day1, alpha = 0.05 )
res_batch3_day1
log2 fold change (MLE): cell line P1E10 vs Parental
Wald test p-value: cell line P1E10 vs Parental
DataFrame with 56726 rows and 6 columns
baseMean log2FoldChange lfcSE stat
<numeric> <numeric> <numeric> <numeric>
GAS5 3961072 0.2446362 0.0339371 7.208511
SNORD30 1821940 0.2591363 0.0384223 6.744423
SNORD49A 1512020 0.2499211 0.0373335 6.694278
SNORD26 978999 0.2163275 0.0334244 6.472148
SNHG1 822970 0.0132365 0.0371975 0.355845
... ... ... ... ...
PJY302_EMX1:inactive 5291.362 -1.733848 0.0911872 -19.01417
PJY302_EMX1:premature_termination 678.258 -0.942775 0.0920436 -10.24270
PJY306_EMX1:active 3244.590 0.822682 0.0699691 11.75778
PJY306_EMX1:inactive 557.327 -0.116098 0.0758092 -1.53145
PJY306_EMX1:premature_termination 101.022 -1.211226 0.1573306 -7.69861
pvalue padj
<numeric> <numeric>
GAS5 5.65670e-13 2.11559e-11
SNORD30 1.53637e-11 5.08829e-10
SNORD49A 2.16740e-11 7.07169e-10
SNORD26 9.66192e-11 2.93474e-09
SNHG1 7.21957e-01 9.28340e-01
... ... ...
PJY302_EMX1:inactive 1.30172e-80 4.14871e-78
PJY302_EMX1:premature_termination 1.27619e-24 1.03560e-22
PJY306_EMX1:active 6.44051e-32 6.68082e-30
PJY306_EMX1:inactive 1.25658e-01 4.63556e-01
PJY306_EMX1:premature_termination 1.37558e-14 5.81737e-13
Code
out of 49768 with nonzero total read count
adjusted p-value < 0.05
LFC > 0 (up) : 862, 1.7%
LFC < 0 (down) : 1733, 3.5%
outliers [1] : 1, 0.002%
low counts [2] : 27776, 56%
(mean count < 2)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
Log fold change shrinkage
Run log fold change shrinkage to account for low counts.
Code
res_batch3_day1_lfc <- lfcShrink (dds_batch3_day1,
coef = "cell_line_P1E10_vs_Parental" ,
type = "apeglm" , parallel = TRUE
)
using 'apeglm' for LFC shrinkage. If used in published research, please cite:
Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
sequence count data: removing the noise and preserving large differences.
Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
MA Plot (lfc shrunk)
MA Plot of shrunken log2 fold changes
Code
res_batch3_day1_lfc_df <- res_batch3_day1_lfc %>%
as.data.frame () %>%
rownames_to_column ("gene" ) %>%
dplyr:: mutate (significant = padj < 0.05 )
write_tsv (res_batch3_day1_lfc_df, file = "diffexp_results_batch3_day1.tsv" )
res_batch3_day1_lfc_labelled <- res_batch3_day1_lfc_df %>%
dplyr:: filter (gene %in% exogenous_rna_names) %>%
na.omit ()
ggplot (
res_batch3_day1_lfc_df,
aes (x = log2 (baseMean), y = log2FoldChange, colour = significant)
) +
geom_point () +
ggrepel:: geom_label_repel (
data = res_batch3_day1_lfc_labelled,
aes (label = gene, segment.colour = "black" ),
min.segment.length = 0 ,
max.overlaps = 30 ,
size = 2.5 ,
show.legend = FALSE
) +
theme_bw ()
Warning: Removed 6958 rows containing missing values (geom_point).
Counts of exogenous rna
Plot of counts for a single gene (with lowest adjusted p-value)
Code
plot_list <- list ()
for (gene in res_batch3_day1_lfc_labelled$ gene) {
gene_rna_type <- str_split (gene, ":" )[[1 ]][1 ]
exogenous_rna <- rna_mixes %>%
dplyr:: filter (rna_species == gene_rna_type) %>%
pull (exogenous_rna)
exogenous_rna_regex <- paste (exogenous_rna, collapse = "|" )
d <- plotCounts (dds_batch3_day1,
gene = gene,
intgroup = "cell_line" ,
returnData = TRUE
)
p <- ggplot (
d %>%
rownames_to_column ("sample_unit" ) %>%
dplyr:: filter (grepl (exogenous_rna_regex, sample_unit)),
aes (x = cell_line, y = count)
) +
geom_point (position = position_jitter (width = 0.1 , height = 0 )) +
ggtitle (gene)
plot_list[[gene]] <- p
}
cowplot:: plot_grid (plotlist = plot_list, ncol = 3 )
Batch 3 - Day 2
Select subset of samples then calculate the log2 fold change: cell line P1E10 vs Parental
Code
dds_batch3_day2 <- subset (dds, select = colData (dds)$ batch == "batch3" &
colData (dds)$ day == "day2" )
dds_batch3_day2$ exogenous_rna <- droplevels (dds_batch3_day2$ exogenous_rna)
dds_batch3_day2$ day <- droplevels (dds_batch3_day2$ day)
design (dds_batch3_day2) <- ~ exogenous_rna + cell_line
dds_batch3_day2 <- DESeq (dds_batch3_day2, parallel = TRUE )
gene-wise dispersion estimates: 4 workers
mean-dispersion relationship
final dispersion estimates, fitting model and testing: 4 workers
Code
class: DESeqDataSet
dim: 56726 12
metadata(1): version
assays(4): counts mu H cooks
rownames(56726): GAS5 SNORD30 ... PJY306_EMX1:inactive
PJY306_EMX1:premature_termination
rowData names(26): baseMean baseVar ... deviance maxCooks
colnames(12): Parental_mastermix1_day2_rep1
Parental_mastermix1_day2_rep2 ... P1E10_sorted_mastermix2_day2_rep2
P1E10_sorted_mastermix2_day2_rep3
colData names(10): sample_unit sample_name ... fq2 sizeFactor
Code
res_batch3_day2 <- results (dds_batch3_day2, alpha = 0.05 )
res_batch3_day2
log2 fold change (MLE): cell line P1E10 vs Parental
Wald test p-value: cell line P1E10 vs Parental
DataFrame with 56726 rows and 6 columns
baseMean log2FoldChange lfcSE stat
<numeric> <numeric> <numeric> <numeric>
GAS5 4358510 0.366504 0.0344499 10.638768
SNORD30 2397541 0.329002 0.0411267 7.999713
SNORD49A 1753252 0.415111 0.0480745 8.634753
SNORD26 1116920 0.229311 0.0408223 5.617289
SNHG1 818061 0.037861 0.0420172 0.901082
... ... ... ... ...
PJY302_EMX1:inactive 7483.494 -1.4442936 0.1202005 -12.015708
PJY302_EMX1:premature_termination 610.148 0.0215212 0.0909814 0.236545
PJY306_EMX1:active 5055.971 0.8342056 0.0894112 9.329987
PJY306_EMX1:inactive 673.181 0.0215212 0.0893793 0.240785
PJY306_EMX1:premature_termination 88.067 -0.4689319 0.1847407 -2.538325
pvalue padj
<numeric> <numeric>
GAS5 1.96712e-26 1.72482e-24
SNORD30 1.24710e-15 5.66381e-14
SNORD49A 5.88542e-18 3.19903e-16
SNORD26 1.93976e-08 4.78468e-07
SNHG1 3.67545e-01 7.16626e-01
... ... ...
PJY302_EMX1:inactive 2.93843e-33 3.44370e-31
PJY302_EMX1:premature_termination 8.13010e-01 9.48110e-01
PJY306_EMX1:active 1.05884e-20 7.06626e-19
PJY306_EMX1:inactive 8.09722e-01 9.47014e-01
PJY306_EMX1:premature_termination 1.11385e-02 7.40867e-02
Code
out of 50043 with nonzero total read count
adjusted p-value < 0.05
LFC > 0 (up) : 1167, 2.3%
LFC < 0 (down) : 2127, 4.3%
outliers [1] : 6, 0.012%
low counts [2] : 26012, 52%
(mean count < 2)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
Log fold change shrinkage
Run log fold change shrinkage to account for low counts.
Code
res_batch3_day2_lfc <- lfcShrink (dds_batch3_day2,
coef = "cell_line_P1E10_vs_Parental" ,
type = "apeglm" , parallel = TRUE
)
using 'apeglm' for LFC shrinkage. If used in published research, please cite:
Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
sequence count data: removing the noise and preserving large differences.
Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
MA Plot (lfc shrunk)
MA Plot of shrunken log2 fold changes
Code
res_batch3_day2_lfc_df <- res_batch3_day2_lfc %>%
as.data.frame () %>%
rownames_to_column ("gene" ) %>%
dplyr:: mutate (significant = padj < 0.05 )
write_tsv (res_batch3_day2_lfc_df, file = "diffexp_results_batch3_day2.tsv" )
res_batch3_day2_lfc_labelled <- res_batch3_day2_lfc_df %>%
dplyr:: filter (gene %in% exogenous_rna_names) %>%
na.omit ()
ggplot (
res_batch3_day2_lfc_df,
aes (x = log2 (baseMean), y = log2FoldChange, color = significant)
) +
geom_point () +
ggrepel:: geom_label_repel (
data = res_batch3_day2_lfc_labelled,
mapping = aes (label = gene, segment.colour = "black" ),
min.segment.length = 0 ,
max.overlaps = 30 ,
size = 2.5 ,
show.legend = FALSE
) +
theme_bw ()
Warning: Removed 6683 rows containing missing values (geom_point).
Counts of exogenous rna
Plot of counts for a single gene (with lowest adjusted p-value)
Code
plot_list <- list ()
for (gene in res_batch3_day2_lfc_labelled$ gene) {
gene_rna_type <- str_split (gene, ":" )[[1 ]][1 ]
exogenous_rna <- rna_mixes %>%
dplyr:: filter (rna_species == gene_rna_type) %>%
pull (exogenous_rna)
exogenous_rna_regex <- paste (exogenous_rna, collapse = "|" )
d <- plotCounts (dds_batch3_day2,
gene = gene,
intgroup = "cell_line" ,
returnData = TRUE
)
p <- ggplot (
d %>%
rownames_to_column ("sample_unit" ) %>%
dplyr:: filter (grepl (exogenous_rna_regex, sample_unit)),
aes (x = cell_line, y = count)
) +
geom_point (position = position_jitter (width = 0.1 , height = 0 )) +
ggtitle (gene)
plot_list[[gene]] <- p
}
cowplot:: plot_grid (plotlist = plot_list, ncol = 3 )